# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

#' Read a CSV or other delimited file with Arrow
#'
#' These functions use the Arrow C++ CSV reader to read data into a `data.frame`.
#' Arrow C++ options have been mapped to argument names that follow those of
#' `readr::read_delim()`, and `col_select` was inspired by `vroom::vroom()`.
#'
#' `read_csv_arrow()` and `read_tsv_arrow()` are wrappers around
#' `read_delim_arrow()` that specify a delimiter.
#'
#' Note that not all `readr` options are currently implemented here. Please file
#' an issue if you encounter one that `arrow` should support.
#'
#' If you need to control Arrow-specific reader parameters that don't have an
#' equivalent in `readr::read_csv()`, you can either provide them in the
#' `parse_options`, `convert_options`, or `read_options` arguments, or you can
#' call [csv_table_reader()] directly for lower-level access.
#'
#' @param file A character path to a local file, or an Arrow input stream
#' @param delim Single character used to separate fields within a record.
#' @param quote Single character used to quote strings.
#' @param escape_double Does the file escape quotes by doubling them?
#' i.e. If this option is `TRUE`, the value `""""` represents
#' a single quote, `\"`.
#' @param escape_backslash Does the file use backslashes to escape special
#' characters? This is more general than `escape_double` as backslashes
#' can be used to escape the delimiter character, the quote character, or
#' to add special characters like `\\n`.
# #' @param col_names If `TRUE`, the first row of the input will be used as the
# #' column names and will not be included in the data frame. Note that `FALSE`
# #' is not currently supported, nor is specifying a character vector of column
# #' names.
#' @param col_select A [tidy selection specification][tidyselect::vars_select]
#' of columns, as used in `dplyr::select()`.
#' @param skip_empty_rows Should blank rows be ignored altogether? If
#' `TRUE`, blank rows will not be represented at all. If `FALSE`, they will be
#' filled with missings.
# #' @param skip Number of lines to skip before reading data.
#' @param parse_options see [csv_parse_options()]. If given, this overrides any
#' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.).
#' @param convert_options see [csv_convert_options()]
#' @param read_options see [csv_read_options()]
#' @param as_tibble Should the function return a `data.frame` or an
#' [arrow::Table][arrow__Table]?
#'
#' @return A `data.frame`, or an `arrow::Table` if `as_tibble = FALSE`.
#' @export
read_delim_arrow <- function(file,
                             delim = ",",
                             quote = '"',
                             escape_double = TRUE,
                             escape_backslash = FALSE,
                             # col_names = TRUE,
                             # col_types = TRUE,
                             col_select = NULL,
                             # na = c("", "NA"),
                             # quoted_na = TRUE,
                             skip_empty_rows = TRUE,
                             # skip = 0L,
                             parse_options = NULL,
                             convert_options = NULL,
                             read_options = csv_read_options(),
                             as_tibble = TRUE) {

  # col_names and skip are hardcoded pending
  # https://issues.apache.org/jira/browse/ARROW-5747
  col_names <- TRUE
  skip <- 0L

  # Only build parse options from the readr-style arguments when the caller
  # hasn't supplied a parse_options object (which takes precedence).
  if (is.null(parse_options)) {
    if (isTRUE(col_names)) {
      # Arrow's header_rows counts the row of column names, so skip one more.
      # With the hardcoding above, header_rows is always 1, which turns out
      # to be the only value that works meaningfully.
      skip <- skip + 1L
    }
    parse_options <- readr_to_csv_parse_options(
      delim = delim,
      quote = quote,
      escape_double = escape_double,
      escape_backslash = escape_backslash,
      skip_empty_rows = skip_empty_rows,
      skip = skip
    )
  }

  if (is.null(convert_options)) {
    # TODO:
    # * na strings (needs wiring in csv_convert_options)
    # * col_types (needs wiring in csv_convert_options). Note that we can't do
    #   col_types if col_names is strings because the column type specification
    #   requires a map of name: type, but the CSV reader doesn't handle user-
    #   provided names--they're renamed after the fact.
    convert_options <- csv_convert_options()
  }

  # Read the whole input into an arrow::Table, applying the tidy selection
  # (enquo must capture col_select in this frame, so keep it inline here).
  tab <- csv_table_reader(
    file,
    read_options = read_options,
    parse_options = parse_options,
    convert_options = convert_options
  )$Read()$select(!!enquo(col_select))

  if (is.character(col_names)) {
    # TODO: Rename `tab`'s columns
    # See https://github.com/apache/arrow/pull/4557
    # (unreachable while col_names is hardcoded to TRUE above)
  }

  if (isTRUE(as_tibble)) {
    tab <- as.data.frame(tab)
  }

  tab
}

#' @rdname read_delim_arrow
#' @export
read_csv_arrow <- function(file,
                           quote = '"',
                           escape_double = TRUE,
                           escape_backslash = FALSE,
                           # col_names = TRUE,
                           # col_types = TRUE,
                           col_select = NULL,
                           # na = c("", "NA"),
                           # quoted_na = TRUE,
                           skip_empty_rows = TRUE,
                           # skip = 0L,
                           parse_options = NULL,
                           convert_options = NULL,
                           read_options = csv_read_options(),
                           as_tibble = TRUE) {

  # Rewrite this call into a read_delim_arrow() call with delim = ",".
  # Using match.call()/eval.parent() (rather than calling the function
  # directly) keeps col_select unevaluated so tidy selection still works.
  delegated <- match.call()
  delegated[[1L]] <- as.name("read_delim_arrow")
  delegated$delim <- ","
  eval.parent(delegated)
}

#' @rdname read_delim_arrow
#' @export
read_tsv_arrow <- function(file,
                           quote = '"',
                           escape_double = TRUE,
                           escape_backslash = FALSE,
                           # col_names = TRUE,
                           # col_types = TRUE,
                           col_select = NULL,
                           # na = c("", "NA"),
                           # quoted_na = TRUE,
                           skip_empty_rows = TRUE,
                           # skip = 0L,
                           parse_options = NULL,
                           convert_options = NULL,
                           read_options = csv_read_options(),
                           as_tibble = TRUE) {

  # Rewrite this call into a read_delim_arrow() call with a tab delimiter.
  # Using match.call()/eval.parent() (rather than calling the function
  # directly) keeps col_select unevaluated so tidy selection still works.
  delegated <- match.call()
  delegated[[1L]] <- as.name("read_delim_arrow")
  delegated$delim <- "\t"
  eval.parent(delegated)
}

#' @include R6.R

# R6 wrapper around the C++ arrow::csv::TableReader. Instances hold a
# shared_ptr to the underlying reader; see csv_table_reader() for construction.
`arrow::csv::TableReader` <- R6Class("arrow::csv::TableReader", inherit = `arrow::Object`,
  public = list(
    # Read the entire input and return it wrapped as an arrow::Table.
    Read = function() shared_ptr(`arrow::Table`, csv___TableReader__Read(self))
  )
)

# Thin R6 wrappers for the C++ CSV option structs. Instances are created by
# csv_read_options(), csv_parse_options(), and csv_convert_options() below.
`arrow::csv::ReadOptions` <- R6Class("arrow::csv::ReadOptions", inherit = `arrow::Object`)
`arrow::csv::ParseOptions` <- R6Class("arrow::csv::ParseOptions", inherit = `arrow::Object`)
`arrow::csv::ConvertOptions` <- R6Class("arrow::csv::ConvertOptions", inherit = `arrow::Object`)

#' Read options for the Arrow file readers
#'
#' @param use_threads Whether to use the global CPU thread pool
#' @param block_size Block size we request from the IO layer; also determines
#' the size of chunks when `use_threads` is `TRUE`. NB: if `FALSE`, JSON input
#' must end with an empty line.
#'
#' @export
csv_read_options <- function(use_threads = option_use_threads(),
                             block_size = 1048576L) {
  # Collect the R-level options into a named list and hand it to the C++
  # initializer, wrapping the result in the ReadOptions R6 class.
  opts <- list(
    use_threads = use_threads,
    block_size = block_size
  )
  shared_ptr(`arrow::csv::ReadOptions`, csv___ReadOptions__initialize(opts))
}

# Translate the readr-style argument names (read_delim_arrow() et al.) into
# the Arrow argument names accepted by csv_parse_options().
readr_to_csv_parse_options <- function(delim = ",",
                                       quote = '"',
                                       escape_double = TRUE,
                                       escape_backslash = FALSE,
                                       skip_empty_rows = TRUE,
                                       skip = 0L) {
  # TODO: validate inputs
  csv_parse_options(
    delimiter = delim,
    # an empty quote string disables quoting entirely
    quoting = nzchar(quote),
    quote_char = quote,
    double_quote = escape_double,
    escaping = escape_backslash,
    escape_char = '\\',
    # backslash escaping is also what permits embedded newlines in values
    newlines_in_values = escape_backslash,
    ignore_empty_lines = skip_empty_rows,
    # Arrow treats skipped leading rows as "header rows"
    header_rows = skip
  )
}

#' Parsing options for Arrow file readers
#'
#' @param delimiter Field delimiter
#' @param quoting Whether quoting is used
#' @param quote_char Quoting character (if `quoting` is `TRUE`)
#' @param double_quote Whether a quote inside a value is double-quoted
#' @param escaping Whether escaping is used
#' @param escape_char Escaping character (if `escaping` is `TRUE`)
#' @param newlines_in_values Whether values are allowed to contain CR (`0x0d`) and LF (`0x0a`) characters
#' @param ignore_empty_lines Whether empty lines are ignored.  If `FALSE`, an
#' empty line is interpreted as containing a single empty value (assuming a
#' one-column CSV file).
#' @param header_rows Number of header rows to skip (including the first row containing column names)
#'
#' @export
csv_parse_options <- function(delimiter = ",",
                              quoting = TRUE,
                              quote_char = '"',
                              double_quote = TRUE,
                              escaping = FALSE,
                              escape_char = '\\',
                              newlines_in_values = FALSE,
                              ignore_empty_lines = TRUE,
                              header_rows = 1L) {
  # Gather all options into one named list for the C++ initializer, then
  # wrap the resulting pointer in the ParseOptions R6 class.
  opts <- list(
    delimiter = delimiter,
    quoting = quoting,
    quote_char = quote_char,
    double_quote = double_quote,
    escaping = escaping,
    escape_char = escape_char,
    newlines_in_values = newlines_in_values,
    ignore_empty_lines = ignore_empty_lines,
    header_rows = header_rows
  )
  shared_ptr(`arrow::csv::ParseOptions`, csv___ParseOptions__initialize(opts))
}

#' Conversion options for the CSV reader
#'
#' @param check_utf8 Whether to check UTF8 validity of string columns
#'
#' @export
csv_convert_options <- function(check_utf8 = TRUE) {
  # TODO: the C++ ConvertOptions struct exposes more fields than we wire up:
  # - column_types: optional per-column types (disables inference on those)
  # - null_values: recognized spellings for null values
  # - true_values / false_values: recognized spellings for booleans
  # - strings_can_be_null: whether string/binary columns can hold nulls.
  #   If true, strings in null_values are considered null for string columns;
  #   if false, all strings are valid string values.
  opts <- list(check_utf8 = check_utf8)
  shared_ptr(`arrow::csv::ConvertOptions`, csv___ConvertOptions__initialize(opts))
}

#' Arrow CSV and JSON table readers
#'
#' These methods wrap the Arrow C++ CSV and JSON table readers.
#' For an interface to the CSV reader that's more familiar for R users, see
#' [read_csv_arrow()]
#'
#' @param file A character path to a local file, or an Arrow input stream
#' @param read_options see [csv_read_options()]
#' @param parse_options see [csv_parse_options()]
#' @param convert_options see [csv_convert_options()]
#' @param ... additional parameters.
#'
#' @return An `arrow::csv::TableReader` or `arrow::json::TableReader` R6
#' object. Call `$Read()` on it to get an Arrow Table.
#' @export
# S3 generic: dispatches on the class of `file` (character path, fs_path,
# input stream, or an already-constructed TableReader).
csv_table_reader <- function(file,
                             read_options = csv_read_options(),
                             parse_options = csv_parse_options(),
                             convert_options = csv_convert_options(),
                             ...) {
  UseMethod("csv_table_reader")
}

#' @export
#' @export
csv_table_reader.default <- function(file,
                                     read_options = csv_read_options(),
                                     parse_options = csv_parse_options(),
                                     convert_options = csv_convert_options(),
                                     ...) {
  # Fallback method: reached when `file` is none of the supported input
  # types. The previous message ("unsupported") gave the user no hint about
  # what went wrong, so say what was received and what is accepted.
  abort(paste0(
    "file must be a character file path or an Arrow input stream, ",
    "not an object of class ", paste(class(file), collapse = "/")
  ))
}

#' @export
`csv_table_reader.character` <- function(file,
                                         read_options = csv_read_options(),
                                         parse_options = csv_parse_options(),
                                         convert_options = csv_convert_options(),
                                         ...) {
  # Normalize the string to an absolute fs_path and re-dispatch, which lands
  # in the fs_path method below.
  path <- fs::path_abs(file)
  csv_table_reader(
    path,
    read_options = read_options,
    parse_options = parse_options,
    convert_options = convert_options,
    ...
  )
}

#' @export
`csv_table_reader.fs_path` <- function(file,
                                       read_options = csv_read_options(),
                                       parse_options = csv_parse_options(),
                                       convert_options = csv_convert_options(),
                                       ...) {
  # Memory-map the file and re-dispatch; the mapped file is an input stream,
  # so this lands in the InputStream method below.
  stream <- mmap_open(file)
  csv_table_reader(
    stream,
    read_options = read_options,
    parse_options = parse_options,
    convert_options = convert_options,
    ...
  )
}

#' @export
`csv_table_reader.arrow::io::InputStream` <- function(file,
                                                      read_options = csv_read_options(),
                                                      parse_options = csv_parse_options(),
                                                      convert_options = csv_convert_options(),
                                                      ...) {
  # Terminal method: actually construct the C++ TableReader from the stream
  # and the three option objects, and wrap it in the R6 class.
  reader <- csv___TableReader__Make(file, read_options, parse_options, convert_options)
  shared_ptr(`arrow::csv::TableReader`, reader)
}

#' @export
# Identity method: `file` is already a constructed TableReader, so return it
# unchanged. Note the options arguments are ignored in this case.
`csv_table_reader.arrow::csv::TableReader` <- function(file,
  read_options = csv_read_options(),
  parse_options = csv_parse_options(),
  convert_options = csv_convert_options(),
  ...
){
  file
}
